# Set-up

## Clear memory
rm(list=ls())

## Load packages
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  tidyverse, readxl, data.table, stringr, collapse, modelsummary, knitr, kableExtra, haven, usmap, tigris
)

## Edit AP3 to reflect 1997 population. 

## The AP3 model takes three inputs. 

### 1. Population. 
### 2. Mortality rates.
### 3. Pollution. 

fips_order = fread("raw/modified-ap3/fips.csv") 
setnames(fips_order, 'V1', 'fips')



### Population

# There are19 age cateogories in SEER.


# Source: https://rdrr.io/github/GerkeLab/fcds/src/data-raw/02_seer-pop-florida-county.R
# Recode Functions --------------------------------------------------------
# Data dictionary: https://seer.cancer.gov/popdata/popdic.html
recode_registry <- c("01" = "San Francisco-Oakland SMSA",
                     "02" = "Connecticut",
                     "20" = "Detroit (Metropolitan)",
                     "21" = "Hawaii",
                     "22" = "Iowa",
                     "23" = "New Mexico",
                     "25" = "Seattle (Puget Sound)",
                     "26" = "Utah",
                     "27" = "Atlanta (Metropolitan)",
                     "29" = "Alaska Natives",
                     "31" = "San Jose-Monterey",
                     "33" = "Arizona Indians",
                     "35" = "Los Angeles",
                     "37" = "Rural Georgia",
                     "41" = "California excluding SF/SJM/LA",
                     "42" = "Kentucky",
                     "43" = "Louisiana",
                     "44" = "New Jersey",
                     "47" = "Georgia excluding Atlanta/Rural Georgia",
                     "99" = "Registry for non-SEER area")
recode_race_1969 <- c("1" = "White",
                      "2" = "Black",
                      "3" = "Other")
recode_race_1990 <- c("1" = "White",
                      "2" = "Black",
                      "3" = "Other",
                      "4" = "Other")
# "3" = "American Indian/Alaska Native",
# "4" = "Asian or Pacific Islander")
recode_origin_1990 <- c("0" = "Non-Hispanic",
                        "1" = "Hispanic",
                        "9" = NA_character_)
# "9" = "Not applicable in 1969-2011 W,B,O files")
recode_sex <- c("1" = "Male", "2" = "Female")

age_18_groups_by_index <- function(i) {
  if (i == 0) return("0")
  if (i == 1) return("0 - 4")
  if ((i-1) * 5 == 85) return("85+")
  paste((i-1) * 5, i * 5 - 1, sep = " - ")
}
recode_age_groups <- map_chr(0:18, age_18_groups_by_index)
names(recode_age_groups) <- sprintf("%02d", 0:18)

recode_fct <- function(x, recode_levels) {
  factor(x, levels = names(recode_levels), labels = unname(recode_levels))
}

# Read SEER Fixed Width File ----------------------------------------------
read_seer_fwf <- function(file, ...) {
  readr::read_fwf(
    file,
    col_positions = readr::fwf_widths(
      widths = c(4, 2, 2, 3, 2, 1, 1, 1, 2, 8),
      col_names = c("year", "state", "state_fips", "county_fips",
                    "registry", "race", "origin", "sex", "age_group", "population")
    ),
    col_types = readr::cols(
      population = readr::col_integer(),
      .default = readr::col_character()
    ),
    ...
  )
}

seer_19_ages = read_seer_fwf(file = "raw/seer/us.1990_2016.19ages.adjusted.txt")

# Let's create and save the file for 1997. 
seer_19_1997 = seer_19_ages %>%
    filter(year == 1997)

seer_19_1997 = collap(seer_19_1997, 
                        by = ~ state_fips + county_fips + age_group , 
                        fsum) %>%
                pivot_wider( id_cols = c("state_fips", "county_fips"), 
                                  values_from = population, 
                                  names_from = age_group, 
                                  names_prefix = "age_group_") %>%
    mutate(fips = as.numeric(paste0(state_fips, county_fips)))

seer_19_1997 = fips_order %>%
    left_join(seer_19_1997) %>%
    select(starts_with("age"))



# Write the file 
# fwrite(seer_19_1997, file = "data/modified-ap3/pop_1997.csv", col.names = FALSE)


# Create nationally representitive prime aged worker share

# First create average among prime aged workers in each category

temp = t(colSums(seer_19_1997, na.rm = TRUE))%>%
    as_tibble() %>%
    mutate(age_group_00 = 0) %>%
    mutate(age_group_01 = 0) %>%
    mutate(age_group_02 = 0) %>%
    mutate(age_group_03 = 0) %>%
    mutate(age_group_04 = 0) %>%
    mutate(age_group_13 = 0) %>%
    mutate(age_group_14 = 0) %>%
    mutate(age_group_15 = 0) %>%
    mutate(age_group_16 = 0) %>%
    mutate(age_group_17 = 0) %>%
    mutate(age_group_18 = 0) %>%
    mutate(total_prime_aged = 
               age_group_05 + age_group_06 + 
               age_group_07 + age_group_08 + 
               age_group_09 + age_group_10 + 
               age_group_11 + age_group_12) %>%
    mutate(age_group_05 = age_group_05/total_prime_aged) %>%
    mutate(age_group_06 = age_group_06/total_prime_aged) %>%
    mutate(age_group_07 = age_group_07/total_prime_aged) %>%
    mutate(age_group_08 = age_group_08/total_prime_aged) %>%
    mutate(age_group_09 = age_group_09/total_prime_aged) %>%
    mutate(age_group_10 = age_group_10/total_prime_aged) %>%
    mutate(age_group_11 = age_group_11/total_prime_aged) %>%
    mutate(age_group_12 = age_group_12/total_prime_aged) %>%
    select(-c(total_prime_aged))


# Second make this a matrix (the same for each county)
seer_19_1997_prime_aged = do.call("rbind", replicate(nrow(seer_19_1997), temp, simplify = FALSE))


# Write the file
fwrite(seer_19_1997_prime_aged, file = "data/modified-ap3/pop_1997_prime_aged.csv", col.names = FALSE)


### Mortality rates
# From CDC Wonder and imputed. They use five-year age averages. 
# Source: https://www.pnas.org/content/pnas/suppl/2019/09/05/1905030116.DCSupplemental/pnas.1905030116.sapp.pdf
# > Center for Disease Control and Prevention (CDC) data on baseline mortality rates for age groups in 5-year intervals are used for marginal damage calculations in the AP3 model. They were downloaded from CDC’s WONDER database (5) in August 2018. CDC does not publish mortality rates for certain counties and age groups, so a fraction of rates had to be imputed based on state, regional or national averages.


county_mort_1997 = fread("raw/CDC/mortality-county-1997.txt") %>%
  mutate(fips_char = sprintf("%05d", `County Code`)) %>%
  mutate(state = as.numeric(substr(fips_char, 1, 2))) %>%
  mutate(county_rate = as.numeric(`Crude Rate`))

state_mort_1997 = fread("raw/CDC/mortality-state-1997.txt") %>%
  rename(state = `State Code`) %>%
  rename(state_rate = "Crude Rate") %>%
  mutate(state_date = as.numeric(state_rate)) %>%
  select(state, state_rate, `Age Group`)  %>%
  mutate(census_division = case_when(state %in% c(9, 23, 25, 33, 44, 50) ~ 1,
                                     state %in% c(34, 36, 42) ~ 2,
                                     state %in% c(17, 18, 26, 39, 55) ~ 3,
                                     state %in% c(19, 20, 27, 29, 31, 38, 46) ~ 4,
                                     state %in% c(10, 11, 12, 13, 24, 37, 45, 51, 54) ~ 5,
                                     state %in% c(1, 21, 28, 47) ~ 6,
                                     state %in% c(5, 22, 40, 48) ~ 7,
                                     state %in% c(4, 8, 16, 30, 32, 35, 49, 56) ~ 8,
                                     state %in% c(2, 6, 15, 41, 53) ~ 9,
                                     TRUE ~ 0))

division_mort_1997 = fread("raw/CDC/mortality-division-1997.txt") %>%
  select(Division, `Age Group`, `Crude Rate`) %>%
  mutate(division_rate = as.numeric(`Crude Rate`)) %>%
  mutate(census_division = case_when( Division == "Division 1: New England" ~ 1, 
                                      Division == "Division 2: Middle Atlantic"  ~ 2,
                                      Division == "Division 3: East North Central" ~ 3,
                                      Division == "Division 4: West North Central" ~ 4,
                                      Division == "Division 5: South Atlantic" ~ 5,
                                      Division == "Division 6: East South Central" ~ 6,
                                      Division == "Division 7: West South Central" ~ 7,
                                      Division == "Division 8: Mountain" ~ 8,
                                      Division == "Division 9: Pacific" ~ 9,
                                      TRUE ~ 0)) %>%
  select(census_division, `Age Group`, division_rate)


mort_1997 = county_mort_1997 %>%
  left_join(state_mort_1997) %>%
  left_join(division_mort_1997) %>%
  mutate(county_rate = as.numeric(county_rate)) %>%
  mutate(state_rate = as.numeric(state_rate)) %>%
  mutate(division_rate = as.numeric(state_rate)) %>%
  mutate(imputed_state_death_rate = ifelse(is.na(state_rate), division_rate, state_rate)) %>%
  mutate(imputed_death_rate = ifelse(is.na(county_rate), imputed_state_death_rate, county_rate)) %>%
  filter(`Age Group Code` != "NS") %>%
  select(imputed_death_rate, `County Code`, `Age Group`) %>%
  rename(fips = `County Code`)

mort_1997 = fips_order %>%
  left_join(mort_1997) %>%
  pivot_wider(id_cols = fips, 
              values_from = imputed_death_rate, 
              names_from = `Age Group`) 


mort_1997 = mort_1997 %>%
  select(-c(fips, "NA")) %>%
  mutate(`25-29 years` = `25-34 years`) %>%
  rename(`30-34 years` = `25-34 years`) %>%
  mutate(`35-39 years` = `35-44 years`) %>%
  rename(`40-44 years` = `35-44 years`) %>%
  mutate(`45-49 years` = `45-54 years`) %>%
  rename(`50-54 years` = `45-54 years`) %>%
  mutate(`55-59 years` = `55-64 years`) %>%
  rename(`60-64 years` = `55-64 years`) %>%
  mutate(`65-69 years` = `65-74 years`) %>%
  rename(`70-74 years` = `65-74 years`) %>%
  mutate(`75-79 years` = `75-84 years`) %>%
  rename(`80-84 years` = `75-84 years`) 

mort_1997 = mort_1997 %>%
  relocate("< 1 year",    "1-4 years",   "5-9 years",  "10-14 years", "15-19 years", "20-24 years", "25-29 years", "30-34 years", "35-39 years", "40-44 years",  "45-49 years",  "50-54 years",  "55-59 years",  "60-64 years", "65-69 years",  "70-74 years", "75-79 years" ,"80-84 years", "85+ years" ) %>%
  mutate_all(funs(. /100000))


# CDC provides death rates in 13 age groups, but SEER is in 19. Now I need to replicate the data in a few spots so it is consistent. 

fwrite(mort_1997, "data/modified-ap3/mort_1997.csv", col.names = FALSE)




### Emissions totals 

# We're just going to assume that all point sources are low since 96% were in 2014. 

nei1997facility = fread("raw/nei/1997FacilitySummarymade08102005.txt") %>%
    select(STATE_FIPS, COUNTY_FIPS, "NH3", "NOX", "SO2", "VOC", PM25) %>%
    collap(by = ~STATE_FIPS + COUNTY_FIPS, fsum) %>%
    replace(is.na(.), 0) %>%
    mutate(state = sprintf("%02d", STATE_FIPS)) %>%
    mutate(county = sprintf("%03d", COUNTY_FIPS)) %>%
    mutate(fips = as.numeric(paste0(state, county))) %>%
    select(-c(STATE_FIPS, state, COUNTY_FIPS, county))

nei1997facility = fips_order %>%
    left_join(nei1997facility) %>%
    replace(is.na(.), 0) %>%
    mutate(zeros = 0) %>%
    relocate(fips, "NH3", "NOX", zeros, PM25, "SO2", "VOC")  %>%
    select(-c(fips))


# low.ap3<-low.ap3[, c('NH3', 'NOx', 'empty', 'PM25', 'SO2', 'VOC')]
# Source: line 328, https://github.com/ptschofen/PNAS_SectoralMortality/blob/master/R%20Code/NEI%20Data%20Preprocessing/FacilityDataProcessing.R


fwrite(nei1997facility, "data/modified-ap3/low_1997.csv", col.names = FALSE)


